#!/usr/bin/python3

"""
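This script reads the CSV output from ssdeep in the malicious_apk
and benign_apk folders and writes a similarity-score vector and a
classification for each sample to a JSON file (app_hash_vectors.json)
for later analysis.

The output data format is as follows:
{"features": ["similarity_limit_0", "similarity_limit_0.2", ...],
 "apps": {"999eca2457729e371355aea5faa38e14.apk": {"vector": [0,0,0,1,0,0], "malicious": [0,1]}, ...}}

"vector" has one entry per feature, with a single 1 marking the bucket that
the sample's mean similarity score falls into. "malicious" is [1,0] for a
malicious sample and [0,1] for a benign one.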
"""

import os
import json
import glob
import random
import numpy
import ssdeep

__author__='mwleeds'

def _read_ssdeep_csv(csv_path):
    """Return a list of ``(app_name, hash)`` tuples read from *csv_path*.

    The first line is treated as a header and skipped. On every other
    non-blank line the text before the first comma is taken as the hash and
    the rest as the (optionally double-quoted) file path; only the part of
    the path after the last '/' is kept as the app name.
    NOTE(review): this assumes ssdeep's ``hash,"path"`` output layout --
    confirm against the ssdeep version that produced the CSV files.
    """
    entries = []
    with open(csv_path) as hashes:
        next(hashes, None)  # skip the header line
        for line in hashes:
            line = line.rstrip('\r\n')
            if not line:
                continue  # tolerate blank / trailing empty lines
            # Split on the first comma only, so a comma inside the file
            # name cannot cut the path short.
            b64hash, _, path = line.partition(',')
            if len(path) >= 2 and path[0] == '"' and path[-1] == '"':
                path = path[1:-1]
            entries.append((path.split('/')[-1], b64hash))
    return entries

def _one_hot_bucket(score, limits):
    """Return a one-hot list marking which bucket *score* falls into.

    Bucket k covers ``(limits[k-1], limits[k]]``; the first bucket starts
    just below zero so that a score of exactly 0 lands in it. A score above
    the largest limit is assigned to the last bucket.
    """
    bit_vector = []
    last_limit = -0.01
    for limit in limits:
        bit_vector.append(1 if last_limit < score <= limit else 0)
        last_limit = limit
    if not any(bit_vector): # score > largest limit
        bit_vector[-1] = 1
    return bit_vector

def main():
    """Build similarity vectors for every app and dump them to JSON.

    Reads ``benign_apk/benign_apk_ssdeep.csv`` and
    ``malicious_apk/malicious_apk_ssdeep.csv`` relative to the current
    working directory. Each app's hash is compared against 1000 randomly
    chosen hashes from its own category, and the mean similarity is
    bucketed into a one-hot vector. The result is written to
    ``app_hash_vectors.json`` in the current working directory and a short
    summary is printed.
    """
    similarity_buckets = ['similarity_limit_0', 'similarity_limit_0.2', 'similarity_limit_0.4', 'similarity_limit_0.6', 'similarity_limit_0.8', 'similarity_limit_1.0']
    # The numeric upper bound of each bucket is encoded in the feature name.
    bucket_limits = [float(name.split('_')[-1]) for name in similarity_buckets]
    samples_per_app = 1000

    all_hashes = {'malicious': [], 'benign': []}
    app_malicious_map = {} # mapping from android app names to [1,0] for malware or [0,1] for goodware
    for category in ('benign', 'malicious'):
        # Build the path instead of chdir-ing into the folder so that a
        # missing file cannot leave the process in a different directory.
        csv_path = os.path.join(category + '_apk', category + '_apk_ssdeep.csv')
        for app_name, b64hash in _read_ssdeep_csv(csv_path):
            app_malicious_map[app_name] = [1, 0] if category == 'malicious' else [0, 1]
            all_hashes[category].append((app_name, b64hash))

    all_apps = {} # mapping from each app to its similarity vector and classification
    num_zero = {}
    num_each = {}
    for category, entries in all_hashes.items():
        num_zero[category] = 0
        num_each[category] = 0
        for app_name, this_hash in entries:
            # Sampling is with replacement and may pick the app itself.
            similarity_scores = [
                ssdeep.compare(this_hash, random.choice(entries)[1])
                for _ in range(samples_per_app)
            ]
            # NOTE(review): python-ssdeep documents compare() as returning
            # 0-100, while the bucket limits span 0-1, so most non-zero
            # means end up in the last bucket -- confirm the intended scale.
            score = numpy.mean(similarity_scores)
            num_each[category] += 1
            if score == 0:
                num_zero[category] += 1
            all_apps[app_name] = {
                'vector': _one_hot_bucket(score, bucket_limits),
                'malicious': app_malicious_map[app_name],
            }

    with open('app_hash_vectors.json', 'w') as outfile:
        json.dump({'features': similarity_buckets, 'apps': all_apps}, outfile)
    # The benign total is num_each['benign'] (the zero count used to be
    # printed twice here).
    print('{} of {} malicious apps and {} of {} benign apps had zero similarity found'.format(num_zero['malicious'], num_each['malicious'], num_zero['benign'], num_each['benign']))
    print('Wrote data on ' + str(len(all_apps)) + ' apps to a file.')

# Run main() only when this file is executed as a script, not when imported.
if __name__=='__main__':
    main()
